library(GSEABase)
library(Matrix)
library(ggplot2)
library(SiPSiC)

# Constants Definition
# Percentage (not an absolute count) of the cells that remain after cell filtering:
# filterData() divides this value by 100 and only keeps genes that are non-zero in
# more than that share of the cells.
minimalClusterSize <- 10
# Constant that filterData() subtracts from 2^x when it reverts log-scaled input.
logScalingConstant <- 1
# filterData() only keeps cells with at least this many non-zero genes.
minNumOfGenesExpressed <- 1000

# Filters an expression matrix (genes in rows, cells in columns) and optionally
# rescales every cell to counts per million (CPM).
#
# Steps, in this order:
#   1. If isLogTPM is TRUE the log scaling is reverted: 2^x - scalingConstant.
#   2. Cells with fewer than minGenesExpressed non-zero genes are removed.
#   3. Genes that are non-zero in no more than minClusterPercent percent of the
#      remaining cells are removed.
#   4. If convertToCPM is TRUE every cell is divided by its column sum and
#      multiplied by 1,000,000.
#
# Arguments:
#   dataMatrix        - numeric matrix-like object, genes x cells.
#   isLogTPM          - TRUE when dataMatrix holds log2(value + scalingConstant).
#   convertToCPM      - TRUE to rescale the filtered values to CPM.
#   minGenesExpressed - minimal number of non-zero genes a cell must have
#                       (defaults to the global minNumOfGenesExpressed).
#   minClusterPercent - percentage of cells a gene must exceed to be kept
#                       (defaults to the global minimalClusterSize).
#   scalingConstant   - constant subtracted after exponentiation
#                       (defaults to the global logScalingConstant).
#
# Returns: the filtered (and possibly rescaled) data, always two-dimensional,
#          even when only a single cell or a single gene survives the filters.
filterData <- function(dataMatrix, isLogTPM, convertToCPM,
                       minGenesExpressed = minNumOfGenesExpressed,
                       minClusterPercent = minimalClusterSize,
                       scalingConstant = logScalingConstant)
{
  filteredDataMatrix <- dataMatrix
  
  if (isLogTPM == TRUE)
  {
    filteredDataMatrix <- 2^(filteredDataMatrix) - scalingConstant
  }
  
  # Filtering out cells which express less than the minimal number of genes.
  # drop = FALSE keeps the two-dimensional shape when a single cell survives;
  # without it the result collapses to a vector and the gene filter below fails.
  expressedGenesCounters <- colSums(filteredDataMatrix != 0)
  cellsWithEnoughGenes <- expressedGenesCounters >= minGenesExpressed
  filteredDataMatrix <- filteredDataMatrix[, cellsWithEnoughGenes, drop = FALSE]
  
  # Filtering out genes which are expressed by too small a share of the remaining cells.
  # The threshold is a percentage of the cell count, and the comparison is strict.
  nonZeroCellCountsForGenes <- rowSums(filteredDataMatrix != 0)
  minNumOfCellsInClust <- ncol(filteredDataMatrix) * (minClusterPercent / 100)
  genesWithMinExpression <- nonZeroCellCountsForGenes > minNumOfCellsInClust
  filteredDataMatrix <- filteredDataMatrix[genesWithMinExpression, , drop = FALSE]
  
  # Converting the transcript counts to CPM
  if (convertToCPM == TRUE)
  {
    countSumsOfCells <- colSums(filteredDataMatrix)
    # After transposing, cells are rows, so recycling the per-cell sums down the
    # columns divides every cell by its own total.
    filteredDataMatrix <- t(filteredDataMatrix)
    filteredDataMatrix <- (filteredDataMatrix / countSumsOfCells) * 1000000
    filteredDataMatrix <- t(filteredDataMatrix)
  }
  
  return (filteredDataMatrix)
}


# Scores a single pathway across all cells with getPathwayScores(), compares the scores
# of the tS1, tS2 and tS3 lineages and writes a violin plot to "<pathway name>.pdf"
# in the current working directory.
#
# Arguments:
#   inputPathway       - gene set object; its `geneIds` and `setName` slots are read.
#   dataMatrix         - expression data, passed to getPathwayScores() unchanged.
#   tS1_cellNames      - not read by this function: every cell that is in neither
#                        tS2_cellNames nor tS3_cellNames is labelled "tS1".
#   tS2_cellNames      - names of the cells labelled "tS2".
#   tS3_cellNames      - names of the cells labelled "tS3".
#   malignantCellNames - names of the cells whose scores are appended to allPathwayScores.
#
# Side effects (on variables outside the function, through `<<-`):
#   allPathwayScores              - gains one row named after the pathway.
#   all_P_Values_<a>_vs_<b>       - gain the unadjusted pairwise t-test P value.
#   all_effect_sizes_<a>_vs_<b>   - gain the difference of the group medians (a - b).
#
# Returns: the value of dev.off() once the plot was written. When getPathwayScores()
#          fails, a warning naming the pathway is raised, nothing is recorded for it
#          and NULL is returned invisibly, so the remaining pathways still get processed.
executePathwayCalculations <- function(inputPathway, dataMatrix, tS1_cellNames, tS2_cellNames, tS3_cellNames, malignantCellNames)
{
  pathwayGenes <- inputPathway@geneIds
  pathwayName <- inputPathway@setName

  # A failed score calculation must not reach the code below: applying `$` to an error
  # object would stop the whole run with an unrelated message.
  pathwayScores <- tryCatch(getPathwayScores(dataMatrix, pathwayGenes),
                            error = function(e)
                            {
                              warning("Skipping pathway '", pathwayName, "': ",
                                      conditionMessage(e), call. = FALSE)
                              NULL
                            })
  if (is.null(pathwayScores))
  {
    return (invisible(NULL))
  }

  scoresAsDataFrame <- as.data.frame(pathwayScores$pathwayScore)
  colnames(scoresAsDataFrame)[1] <- "Score"

  currPathwayScores <- scoresAsDataFrame[malignantCellNames, "Score"]
  names(currPathwayScores) <- malignantCellNames

  allPathwayScores <<- rbind(allPathwayScores, currPathwayScores)
  rownames(allPathwayScores)[nrow(allPathwayScores)] <<- pathwayName

  # "tS1" is the default label; cells listed as tS2 / tS3 are relabelled afterwards.
  scoresAsDataFrame$MaligCellType <- "tS1"
  scoresAsDataFrame[rownames(scoresAsDataFrame) %in% tS2_cellNames, "MaligCellType"] <- "tS2"
  scoresAsDataFrame[rownames(scoresAsDataFrame) %in% tS3_cellNames, "MaligCellType"] <- "tS3"

  # Performing the T test to compare the cells of the different lineages.
  # No correction is applied here (p.adjust.method = "none").
  T.TestResult <- pairwise.t.test(scoresAsDataFrame$Score, scoresAsDataFrame$MaligCellType, p.adjust.method = "none")

  # Fetching and storing P values of the T test
  pValueMatrix <- T.TestResult$p.value
  tS1_vs_tS2_Pval <- pValueMatrix["tS2", "tS1"]
  tS2_vs_tS3_Pval <- pValueMatrix["tS3", "tS2"]
  tS1_vs_tS3_Pval <- pValueMatrix["tS3", "tS1"]
  
  all_P_Values_tS1_vs_tS2[pathwayName] <<- tS1_vs_tS2_Pval
  all_P_Values_tS2_vs_tS3[pathwayName] <<- tS2_vs_tS3_Pval
  all_P_Values_tS1_vs_tS3[pathwayName] <<- tS1_vs_tS3_Pval

  # Calculating and storing effect sizes (difference of medians) for all group comparisons
  tS1_median <- median(scoresAsDataFrame$Score[scoresAsDataFrame$MaligCellType == "tS1"])
  tS2_median <- median(scoresAsDataFrame$Score[scoresAsDataFrame$MaligCellType == "tS2"])
  tS3_median <- median(scoresAsDataFrame$Score[scoresAsDataFrame$MaligCellType == "tS3"])
  
  effectSize_tS1_vs_tS2 <- tS1_median - tS2_median
  effectSize_tS2_vs_tS3 <- tS2_median - tS3_median
  effectSize_tS1_vs_tS3 <- tS1_median - tS3_median
  
  all_effect_sizes_tS1_vs_tS2[pathwayName] <<- effectSize_tS1_vs_tS2
  all_effect_sizes_tS2_vs_tS3[pathwayName] <<- effectSize_tS2_vs_tS3
  all_effect_sizes_tS1_vs_tS3[pathwayName] <<- effectSize_tS1_vs_tS3

  violinPlot <- ggplot(scoresAsDataFrame, aes(x = MaligCellType, y = Score, fill = MaligCellType)) +
    ggtitle(paste0("T-Test tS1 vs. tS2 (Unadjusted!): P < ", tS1_vs_tS2_Pval, "\n",
                   "T-Test tS2 vs. tS3 (Unadjusted!): P < ", tS2_vs_tS3_Pval, "\n",
                   "T-Test tS1 vs. tS3 (Unadjusted!): P < ", tS1_vs_tS3_Pval, "\n",
                   "Effect size tS1 vs. tS2: ", effectSize_tS1_vs_tS2, "\n",
                   "Effect size tS2 vs. tS3: ", effectSize_tS2_vs_tS3, "\n",
                   "Effect size tS1 vs. tS3: ", effectSize_tS1_vs_tS3)) +
    geom_violin(trim=FALSE) + geom_boxplot(width=0.1)

  pdf(paste0(pathwayName, ".pdf"))
  # Close the device even when rendering fails, so that a bad plot cannot leave an
  # open PDF device behind; on success it is closed explicitly below.
  deviceIsOpen <- TRUE
  on.exit(if (deviceIsOpen) dev.off(), add = TRUE)
  print(violinPlot)
  deviceIsOpen <- FALSE
  dev.off()
}


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
############################################# MAIN #############################################
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# NOTE(review): the empty path looks like a placeholder - presumably it has to be set to
# the directory that holds the input files named below before the script is run.
setwd("")
GMT_FILE_NAME <- "h.all.v7.0.symbols.pluscc.gmt"
hallmarkGenesets <- getGmt(GMT_FILE_NAME)

# Reading the cell annotations. stringsAsFactors = FALSE keeps the barcodes as character
# strings on every R version: were "Index" read as a factor, the column selection below
# would index by the factor's integer codes instead of by cell name.
cells_metadata <- read.delim(file = "GSE131907_Lung_Cancer_cell_annotation.txt", header = TRUE, sep = "\t",
                             stringsAsFactors = FALSE)

# Keeping annotated cells only, and discarding the "nLung" and "nLN" sample origins
cells_metadata <- cells_metadata[!is.na(cells_metadata[,"Cell_subtype"]) &
                                 !is.na(cells_metadata[,"Sample_Origin"]),]
cancerCellsMeta <- cells_metadata[(cells_metadata[,"Sample_Origin"] != "nLung") &
                                  (cells_metadata[,"Sample_Origin"] != "nLN"),]
isOfRelevantLineages <- cancerCellsMeta[,"Cell_subtype"] %in% c("tS1", "tS2", "tS3")
cancerCellsMeta <- cancerCellsMeta[isOfRelevantLineages,]
relevantCellBarcodes <- cancerCellsMeta[,"Index"]

# Reading all cell transcriptomes and only keeping cells of the tumorigenic lineages tS1 through tS3
allCellsData <- readRDS(file = "GSE131907_Lung_Cancer_normalized_log2TPM_matrix.rds")
allCellsData <- allCellsData[, relevantCellBarcodes]

# Reverting the log scaling and filtering cells and genes; the metadata is then reduced
# to the cells that survived the filtering, and the data is stored as a sparse matrix.
allCellsData <- filterData(allCellsData, isLogTPM = TRUE, convertToCPM = FALSE)
cancerCellsMeta <- cancerCellsMeta[(cancerCellsMeta[,"Index"] %in% colnames(allCellsData)),]
allCellsData <- Matrix(as.matrix(allCellsData), sparse = TRUE)

# Cell names per lineage, and the names of all cells that are scored
tS1_cellNames <- cancerCellsMeta[(cancerCellsMeta[,"Cell_subtype"] == "tS1"),"Index"]
tS2_cellNames <- cancerCellsMeta[(cancerCellsMeta[,"Cell_subtype"] == "tS2"),"Index"]
tS3_cellNames <- cancerCellsMeta[(cancerCellsMeta[,"Cell_subtype"] == "tS3"),"Index"]
malignantCellNames <- colnames(allCellsData)

# Accumulators that executePathwayCalculations fills through `<<-`:
# one named entry (or one row, for allPathwayScores) per pathway.
all_P_Values_tS1_vs_tS2 <- all_P_Values_tS2_vs_tS3 <- all_P_Values_tS1_vs_tS3 <- numeric()
all_effect_sizes_tS1_vs_tS2 <- all_effect_sizes_tS2_vs_tS3 <- all_effect_sizes_tS1_vs_tS3 <- numeric()
allPathwayScores <- numeric()

# Scoring, testing and plotting every gene set of the collection
lapply(hallmarkGenesets, function(currGeneset)
{
  executePathwayCalculations(currGeneset, allCellsData, tS1_cellNames, tS2_cellNames, tS3_cellNames, malignantCellNames)
})

# Collecting the results per lineage comparison, so that the files can be written in loops
comparisonNames <- c("tS1_vs_tS2", "tS2_vs_tS3", "tS1_vs_tS3")
effectSizesByComparison <- list(all_effect_sizes_tS1_vs_tS2,
                                all_effect_sizes_tS2_vs_tS3,
                                all_effect_sizes_tS1_vs_tS3)
pValuesByComparison <- list(all_P_Values_tS1_vs_tS2,
                            all_P_Values_tS2_vs_tS3,
                            all_P_Values_tS1_vs_tS3)
names(effectSizesByComparison) <- comparisonNames
names(pValuesByComparison) <- comparisonNames

# Effect sizes: one file per comparison
for (currComparison in comparisonNames)
{
  write.csv2(effectSizesByComparison[[currComparison]],
             file = paste0("Effect_Size_", currComparison, ".csv"))
}

# P values: the unadjusted values, followed by their Benjamini-Hochberg adjustment
# (p.adjust corrects for the number of P values it is given)
for (currComparison in comparisonNames)
{
  write.csv2(pValuesByComparison[[currComparison]],
             file = paste0("Unadjusted_p_values_", currComparison, ".csv"))
  FDR_Adjusted_P_Vals <- p.adjust(pValuesByComparison[[currComparison]], method = "BH")
  write.csv2(FDR_Adjusted_P_Vals, file = paste0("FDR_values_", currComparison, ".csv"))
}

# The full pathways x cells score table
saveRDS(allPathwayScores, file = "SiPSiC_allPathwayScores.RDS")